# Necessary packages are imported.
# FIX: library() is preferred over require() for mandatory dependencies --
# require() returns FALSE on a missing package instead of raising an error,
# so a missing package would only surface later with a confusing failure.
library(data.table)
library(lubridate)   # masks hour/month/... from data.table and date/... from base
library(forecast)    # checkresiduals(), time-series utilities
library(skimr)
library(repr)
library(ggplot2)
library(readxl)
library(GGally)      # ggpairs()
library(corrplot)
Current account basically shows the net amount of monetary operations made by a country. It is one of the most important macroeconomic indicators showing the well-being of an economy. As it affects the other economic policies of the country, it is affected by many indicators as well. The independent variables which are going to be used during the time series analysis are listed below:
Brent Petrol Price Index: Trade balance and energy production/consumption is main parameter determining the current account level. Brent petrol prices can be a significant measure considering it leads to changes in both trade balance and cost of energy consumption.
Unemployment: Countries which cannot produce their own consumption have to import goods and services from other countries, which is directly related to the current account. Unemployment is going to be a measurement to integrate the effect of both the production of goods and services and the human capacity. (Labor force could also be used.)
Exchange Rate: Countries with a high exchange rate are unlikely to import goods and services and are likely to increase their exports of goods and services due to low cost.
Central Bank Reserve: The current account shows whether a country is a buyer or a seller. Considering this, if a country is a buyer this will consume its reserves, and if it is a seller its reserves should increase.
# Read the EVDS macroeconomic export and convert it to a data.table.
c_account_all_data <- as.data.table(read_excel("~/Downloads/EVDS-10.xlsx"))
# Assign short, snake_case column names (by reference).
setnames(
  c_account_all_data,
  c("Date", "brent_idx", "c_account", "reserve", "unemployement", "ex_rate")
)
# Excel dates arrive as POSIXct; coerce to plain Date for monthly data.
c_account_all_data[, Date := as.Date(Date)]
head(c_account_all_data)
## Date brent_idx c_account reserve unemployement ex_rate
## <Date> <num> <num> <num> <num> <num>
## 1: 2014-01-01 108.16 -4519 124270 10.5 2.216836
## 2: 2014-02-01 108.98 -2703 127691 10.5 2.212760
## 3: 2014-03-01 105.95 -3264 126051 9.4 2.217795
## 4: 2014-04-01 108.63 -4201 129732 8.8 2.127471
## 5: 2014-05-01 109.21 -2709 130591 8.4 2.090805
## 6: 2014-06-01 111.03 -3599 133534 9.0 2.115724
# Inspect column classes and dimensions of the imported table.
str(c_account_all_data)
## Classes 'data.table' and 'data.frame': 120 obs. of 6 variables:
## $ Date : Date, format: "2014-01-01" "2014-02-01" ...
## $ brent_idx : num 108 109 106 109 109 ...
## $ c_account : num -4519 -2703 -3264 -4201 -2709 ...
## $ reserve : num 124270 127691 126051 129732 130591 ...
## $ unemployement: num 10.5 10.5 9.4 8.8 8.4 9 9.7 10.1 10.1 10.9 ...
## $ ex_rate : num 2.22 2.21 2.22 2.13 2.09 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Google Trends monthly search counts for the Turkish keyword "cari açık".
c_account_search_data <- as.data.table(read_excel("~/Downloads/multiTimeline-8.xlsx"))
setnames(c_account_search_data, c("Date", "c_account_count"))
c_account_search_data[, Date := as.Date(Date)]
# Full outer join on Date keeps every month present in either table.
c_account_merged_data <- merge(c_account_all_data, c_account_search_data, by = "Date", all = TRUE)
# CHATGPT PROMPT: I want to show both peak and bottom values and labels should be dates with months and years
# Local maxima/minima: the sign of the first difference flips + -> - at a peak
# (second difference of the sign equals -2) and - -> + at a trough (+2).
peak_indices_c <- which(diff(sign(diff(c_account_merged_data$c_account))) == -2) + 1
bottom_indices_c <- which(diff(sign(diff(c_account_merged_data$c_account))) == +2) + 1
# Size of each swing, measured against the value two months earlier.
# BUG FIX: the floor of the baseline index must be 1 (the first observation),
# not 3 -- pmax(i - 2, 3) made the earliest extremes compare against index 3,
# i.e. against themselves or a later point, yielding a spurious zero change.
peak_changes_c <- abs(c_account_merged_data$c_account[peak_indices_c] - c_account_merged_data$c_account[pmax(peak_indices_c - 2, 1)])
bottom_changes_c <- abs(c_account_merged_data$c_account[bottom_indices_c] - c_account_merged_data$c_account[pmax(bottom_indices_c - 2, 1)])
# Keep only swings larger than 1500 so the labels stay readable.
significant_peak_indices_c <- peak_indices_c[peak_changes_c > 1500]
significant_bottom_indices_c <- bottom_indices_c[bottom_changes_c > 1500]
combined_indices_c <- union(significant_peak_indices_c, significant_bottom_indices_c)
extreme_data_c <- c_account_merged_data[combined_indices_c, ]
# Series plot with significant extremes highlighted and labelled "Mon YYYY".
ggplot(c_account_merged_data, aes(x = Date, y = c_account)) + geom_line() +
  geom_point(data = extreme_data_c, aes(x = Date, y = c_account), color = "red", size = 1) +
  geom_text(data = extreme_data_c, aes(x = Date, y = c_account, label = format(Date, "%b %Y")), vjust = -1, color = "red", size = 3) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
We can see that there are peaks in a positive direction during summer, especially around August. This is because Turkey has tourism income during summer holidays. And in winter time, since heat consumption increases, the current account decreases. Therefore we can suggest that there is yearly seasonality in the data and it should be considered during modeling.
# Overlay the standardized (z-scored) current account and search-count series
# so two variables on very different scales can be compared visually.
ggplot(c_account_merged_data, aes(x = Date, y = c_account)) +
  geom_line(aes(y = scale(c_account), color = "Current Account")) +
  geom_line(aes(y = scale(c_account_count), color = "Search Count of Cari Açık")) +
  labs(title = "Current Account vs. Search Count of Cari Açık", x = "Date", y = "Count/Current Account") +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  # FIX: a numeric legend.position was deprecated in ggplot2 3.5.0 (it used to
  # emit a lifecycle warning here); use legend.position = "inside" together
  # with legend.position.inside instead.
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "inside",
        legend.position.inside = c(0.2, 0.8))
“Cari Açık” is Turkish word for current account and it could be one of the best options to follow the trend in current account of Turkey. This plot shows that although the keyword and current account amount of Turkey is not correlated, during both peaks (after Jan 2018) and bottoms (after April 2021) there is interest in the keyword.
# Pairwise scatter plots and correlations for every series in the merged table.
ggpairs(c_account_merged_data)
Since our target variable is current account, we can comment on the correlation and relation of this variable with the others. Although there is no information at first sight, we can say that reserve amount has negative correlation with the current account.
Since there is no obvious trend in target variable, we can start decomposing with examining seasonality and trend together.
# Autocorrelation of the target up to lag 36 (three years) to reveal seasonality.
acf(c_account_merged_data$c_account,36)
We can implement yearly seasonality and examine the residual behavior before implementing independent variables.
# Deterministic trend term (1..N) and month-of-year dummies for seasonality.
c_account_merged_data[, trnd := 1:.N]
# FIX: use TRUE rather than T -- T is an ordinary variable and can be reassigned.
c_account_merged_data[, mon := as.character(month(Date, label = TRUE))]
head(c_account_merged_data)
## Key: <Date>
## Date brent_idx c_account reserve unemployement ex_rate
## <Date> <num> <num> <num> <num> <num>
## 1: 2014-01-01 108.16 -4519 124270 10.5 2.216836
## 2: 2014-02-01 108.98 -2703 127691 10.5 2.212760
## 3: 2014-03-01 105.95 -3264 126051 9.4 2.217795
## 4: 2014-04-01 108.63 -4201 129732 8.8 2.127471
## 5: 2014-05-01 109.21 -2709 130591 8.4 2.090805
## 6: 2014-06-01 111.03 -3599 133534 9.0 2.115724
## c_account_count trnd mon
## <num> <int> <char>
## 1: 48 1 Jan
## 2: 62 2 Feb
## 3: 44 3 Mar
## 4: 43 4 Apr
## 5: 31 5 May
## 6: 29 6 Jun
# Work on a copy so modelling columns do not pollute the source table.
# FIX: use <- for assignment instead of = (standard R style).
tmp_c <- copy(c_account_merged_data)
tmp_c[, actual := c_account]
# Baseline model: linear trend plus monthly seasonal dummies.
lm_base_c <- lm(c_account ~ trnd + mon, c_account_merged_data)
summary(lm_base_c)
##
## Call:
## lm(formula = c_account ~ trnd + mon, data = c_account_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6482 -1438 -22 1483 5169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3072.584 813.570 -3.777 0.000262 ***
## trnd -3.033 6.129 -0.495 0.621711
## monAug 3355.232 1035.209 3.241 0.001587 **
## monDec -764.436 1036.079 -0.738 0.462241
## monFeb 156.434 1034.991 0.151 0.880146
## monJan -537.499 1035.082 -0.519 0.604637
## monJul 1159.499 1035.082 1.120 0.265136
## monJun 1056.066 1034.991 1.020 0.309858
## monMar -445.933 1034.936 -0.431 0.667423
## monMay -560.567 1034.936 -0.542 0.589190
## monNov 1149.631 1035.807 1.110 0.269535
## monOct 3840.098 1035.571 3.708 0.000333 ***
## monSep 3224.365 1035.372 3.114 0.002367 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2314 on 107 degrees of freedom
## Multiple R-squared: 0.3455, Adjusted R-squared: 0.2721
## F-statistic: 4.708 on 12 and 107 DF, p-value: 4e-06
# Residual diagnostics: ACF/time plots plus Breusch-Godfrey test for serial
# correlation (a small p-value below indicates residuals are still correlated).
checkresiduals(lm_base_c)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals
## LM test = 67.678, df = 16, p-value = 2.536e-08
# In-sample fit and residuals of the trend + month model.
tmp_c[,predicted_trend_mon:=predict(lm_base_c,tmp_c)]
tmp_c[,residual_trend_mon:=actual-predicted_trend_mon]
# Actual vs fitted over time.
ggplot(tmp_c ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_mon,color='predicted'))
# Residual autocorrelation up to lag 60 (five years).
acf(tmp_c$residual_trend_mon,60)
Although we are already close to white noise, there is still improvement that should be implemented. There is cyclic behaviour in the residuals' autocorrelation, but it repeats every 14-15 months. Since I could not justify that, I will not implement it. Let's plot the residuals against the independent variables.
# Scatter each candidate regressor against the residuals of the baseline model
# to see which independent variables still carry explanatory power.
residual_plots_c <- lapply(
  c("brent_idx", "reserve", "unemployement", "ex_rate", "c_account_count"),
  function(v) {
    ggplot(tmp_c, aes(x = .data[[v]], y = residual_trend_mon)) +
      geom_point()
  }
)
# Arrange the five panels on a 2-row grid.
gridExtra::grid.arrange(grobs = residual_plots_c, nrow = 2)
We can implement the unemployment and reserve variables into the model since they show some correlation with the residuals. But in order not to risk multicollinearity, let's first implement only the reserve variable.
# Add the central-bank reserve level as a regressor on top of trend + month.
# FIX: use <- for assignment instead of = (standard R style).
lm_base_c2 <- lm(c_account ~ trnd + mon + reserve, c_account_merged_data)
summary(lm_base_c2)
##
## Call:
## lm(formula = c_account ~ trnd + mon + reserve, data = c_account_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5369.2 -1413.6 -77.9 1393.8 4683.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2975.19584 1979.92056 1.503 0.135894
## trnd -10.50578 6.27739 -1.674 0.097160 .
## monAug 3584.16877 992.23536 3.612 0.000466 ***
## monDec -428.68470 995.81434 -0.430 0.667716
## monFeb 402.83107 992.40671 0.406 0.685625
## monJan -320.05608 991.87984 -0.323 0.747576
## monJul 1273.63471 990.31361 1.286 0.201213
## monJun 1056.45849 989.63057 1.068 0.288159
## monMar -407.81418 989.64505 -0.412 0.681112
## monMay -629.78719 989.79790 -0.636 0.525966
## monNov 1570.68994 998.49006 1.573 0.118684
## monOct 4166.77418 995.05764 4.187 5.85e-05 ***
## monSep 3393.75395 991.30740 3.424 0.000880 ***
## reserve -0.05237 0.01577 -3.322 0.001228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2213 on 106 degrees of freedom
## Multiple R-squared: 0.4072, Adjusted R-squared: 0.3345
## F-statistic: 5.602 on 13 and 106 DF, p-value: 1.122e-07
# Diagnostics for the trend + month + reserve model; the Breusch-Godfrey
# p-value below shows serial correlation still remains.
checkresiduals(lm_base_c2)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals
## LM test = 62.613, df = 17, p-value = 3.869e-07
# Fitted values and residuals of the extended model.
tmp_c[,predicted_trend_mon_res:=predict(lm_base_c2,tmp_c)]
tmp_c[,residual_trend_mon_res:=actual-predicted_trend_mon_res]
# Actual vs fitted over time.
ggplot(tmp_c ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_mon_res,color='predicted'))
# Residual autocorrelation up to lag 60.
acf(tmp_c$residual_trend_mon_res,60)
Unfortunately, we still see serial correlation in the residuals. This can be handled with differencing, but to implement differencing we should remove the seasonality variables from the data.
# First difference of the target to remove serial correlation.
c_account_merged_data[, lag_1_c_account := shift(c_account, 1)]
c_account_merged_data[, lag_1_diff := c_account - lag_1_c_account]
# Mirror the lag columns on the working copy used for plotting.
tmp_c[, lag_1_c_account := shift(c_account, 1)]
tmp_c[, actual_lag_1_diff := c_account - lag_1_c_account]
# Drop the first row (NA introduced by the lag).
# FIX: use <- for assignment instead of =.
c_account_merged_data <- c_account_merged_data[complete.cases(c_account_merged_data)]
acf(c_account_merged_data$lag_1_diff)
# FIX: spell out lag.max -- `lag=60` relied on partial argument matching.
acf(c_account_merged_data$lag_1_diff, lag.max = 60)
We can observe that differencing prevents the serial correlation but yearly seasonality should still be implemented.
# Seasonal model on the differenced series (month dummies only; the trend is
# already absorbed by differencing).
# FIX: use <- for assignment instead of =.
lm_base_c3 <- lm(lag_1_diff ~ mon, c_account_merged_data)
summary(lm_base_c3)
##
## Call:
## lm(formula = lag_1_diff ~ mon, data = c_account_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6531.4 -833.8 -101.9 1093.8 6950.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 442.9 632.9 0.700 0.48559
## monAug 1749.8 895.1 1.955 0.05321 .
## monDec -2360.0 895.1 -2.637 0.00962 **
## monFeb 248.0 895.1 0.277 0.78226
## monJan 112.2 919.6 0.122 0.90311
## monJul -342.5 895.1 -0.383 0.70274
## monJun 1170.7 895.1 1.308 0.19371
## monMar -1048.3 895.1 -1.171 0.24413
## monMay -1006.5 895.1 -1.124 0.26333
## monNov -3136.4 895.1 -3.504 0.00067 ***
## monOct 169.8 895.1 0.190 0.84990
## monSep -576.8 895.1 -0.644 0.52069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2001 on 107 degrees of freedom
## Multiple R-squared: 0.3222, Adjusted R-squared: 0.2525
## F-statistic: 4.624 on 11 and 107 DF, p-value: 9.674e-06
# Diagnostics for the differenced + seasonal model.
checkresiduals(lm_base_c3)
##
## Breusch-Godfrey test for serial correlation of order up to 15
##
## data: Residuals
## LM test = 44.503, df = 15, p-value = 9.171e-05
# Predicted differences and their residuals.
tmp_c[,predicted_diff_mon:=predict(lm_base_c3,tmp_c)]
tmp_c[,residual_diff_mon:=actual_lag_1_diff-predicted_diff_mon]
# Undo the differencing (add back the lagged level) to plot against actuals.
ggplot(tmp_c ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_diff_mon+lag_1_c_account,color='predicted'))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Throughout the analysis, regression analysis was held first, but serial correlation did not vanish. Later, no relation between the independent variables and the residuals could be found. Therefore differencing was applied to get rid of serial correlation. After the lag-1 difference, there was still a yearly seasonal relation, so yearly seasonality was also implemented in the model. Results show that we reach white noise: the residuals are not correlated and almost all lie within the specified range (blue dotted lines). Predicted values and actual values are also plotted. And since the previous current account could be a good indicator for predicting the next month, implementing lag-1 differencing makes sense. (I am not sure about implementing the yearly seasonality in the differencing model by using dummy variables instead of somehow integrating the lag-12 difference, but it decreased the correlation in the end.)
Housing sales are one of the most significant indicators of the economic growth of a country. Although people usually tend to buy their own houses, there are many other factors in buying or selling a house, such as interest rates, other investment options, inflation rates, etc. The independent variables which are chosen to be used in the time series analysis are listed below:
BIST 100 Index: Borsa Istanbul is an important option to when it comes to investment. It indicates the health of the companies operating in Turkey and when it is tend to increase, buying houses becomes less attractive to invest in.
Interest Rate: Investing the money in the bank for a term and receiving interest and payments with it is another option for investment. Again, when interest rates increases, housing sales drops.
Consumer Price Index: Inflation rate and CPI are the measures the value of the money in that country. Since housing market is investment option besides owning a house, it can be used as a protection from inflation.
Unit Prices: Prices of houses and overall house market prices also determines the amount of sales in a way. Demands and supplies for houses is also critical for this.
# Read the EVDS housing export and convert it to a data.table.
house_sales_all_data <- as.data.table(read_excel("~/Downloads/EVDS-11.xlsx"))
# Assign short, snake_case column names (by reference).
setnames(
  house_sales_all_data,
  c("Date", "house_sales", "bist_idx", "interest_rate", "c_price_idx", "unit_price")
)
# Excel dates arrive as POSIXct; coerce to plain Date for monthly data.
house_sales_all_data[, Date := as.Date(Date)]
head(house_sales_all_data)
## Date house_sales bist_idx interest_rate c_price_idx unit_price
## <Date> <num> <num> <num> <num> <num>
## 1: 2014-01-01 87639 618.10 7.3140 233.54 1510.3
## 2: 2014-02-01 82597 626.04 9.1850 234.54 1524.5
## 3: 2014-03-01 87617 694.98 9.7125 237.18 1541.3
## 4: 2014-04-01 83610 734.35 9.6100 240.37 1575.2
## 5: 2014-05-01 90377 794.46 9.2820 241.32 1597.8
## 6: 2014-06-01 92936 790.40 8.9700 242.07 1628.4
# Inspect column classes and dimensions of the imported table.
str(house_sales_all_data)
## Classes 'data.table' and 'data.frame': 120 obs. of 6 variables:
## $ Date : Date, format: "2014-01-01" "2014-02-01" ...
## $ house_sales : num 87639 82597 87617 83610 90377 ...
## $ bist_idx : num 618 626 695 734 794 ...
## $ interest_rate: num 7.31 9.19 9.71 9.61 9.28 ...
## $ c_price_idx : num 234 235 237 240 241 ...
## $ unit_price : num 1510 1524 1541 1575 1598 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Google Trends monthly search counts for the Turkish keyword "tapu".
house_sales_search_data <- as.data.table(read_excel("~/Downloads/multiTimeline-13.xlsx"))
setnames(house_sales_search_data, c("Date", "tapu_search_count"))
house_sales_search_data[, Date := as.Date(Date)]
# Full outer join on Date keeps every month present in either table.
house_sales_merged_data <- merge(house_sales_all_data, house_sales_search_data, by = "Date", all = TRUE)
# Local maxima/minima via the sign of the first difference: -2 marks a peak,
# +2 marks a trough (see the current-account section for the same technique).
peak_indices_h <- which(diff(sign(diff(house_sales_merged_data$house_sales))) == -2) + 1
bottom_indices_h <- which(diff(sign(diff(house_sales_merged_data$house_sales))) == +2) + 1
# Swing size relative to the previous month (floored at the first observation).
peak_changes_h <- abs(house_sales_merged_data$house_sales[peak_indices_h] - house_sales_merged_data$house_sales[pmax(peak_indices_h - 1,1)])
bottom_changes_h <- abs(house_sales_merged_data$house_sales[bottom_indices_h] - house_sales_merged_data$house_sales[pmax(bottom_indices_h - 1,1)])
# Keep only swings larger than 14000 sales so labels stay readable.
significant_peak_indices_h <- peak_indices_h[peak_changes_h > 14000]
significant_bottom_indices_h <- bottom_indices_h[bottom_changes_h > 14000]
combined_indices_h <- union(significant_peak_indices_h, significant_bottom_indices_h)
extreme_data_h <- house_sales_merged_data[combined_indices_h, ]
# Series plot with significant extremes highlighted and labelled "Mon YYYY".
ggplot(house_sales_merged_data ,aes(x=Date,y=house_sales)) + geom_line() +
geom_point(data = extreme_data_h, aes(x = Date, y = house_sales), color = "blue", size = 1) +
geom_text(data = extreme_data_h, aes(x = Date, y = house_sales, label = format(Date, "%b %Y")), vjust = -1, color = "blue", size=2.5) +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
In this plot, obvious peaks around December and significant drops after that can be seen. Significant fluctuations between July 2019 and January 2023 may be a result of interest rates or other saving and investment options (especially due to Covid conditions).
# Overlay the standardized (z-scored) house-sales and search-count series.
ggplot(house_sales_merged_data, aes(x = Date, y = house_sales)) +
  geom_line(aes(y = scale(house_sales), color = "House Sales")) +
  geom_line(aes(y = scale(tapu_search_count), color = "Search Count of Tapu")) +
  labs(title = "House Sales vs. Search Count of Tapu", x = "Date", y = "Count/House Sales") +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  # FIX: numeric legend.position was deprecated in ggplot2 3.5.0; use
  # legend.position = "inside" together with legend.position.inside.
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "inside",
        legend.position.inside = c(0.2, 0.8))
“Tapu” is a Turkish word representing the processes and documents during house buying or selling. It is a strong word to see the trends in house sales as we can see in the plot. There is an upward trend in search count since some of the processes is held online in time but fluctuations in both housing sales and search count is correlated.
# Pairwise scatter plots and correlations for every series in the merged table.
ggpairs(house_sales_merged_data)
In this plot, when we observe the relation of the other variables with housing sales (the target variable), we can see that interest rates and the BIST index have a somewhat negative correlation, and both the consumer price index and house unit prices also show negative correlation. It would be risky to use both variables in a model since they are strongly correlated between themselves and that would affect the coefficients and interpretability of the model.
Like in current account data, since there is no obvious trend in target variable, we can start decomposing with examining seasonality and trend together.
# Autocorrelation of the target up to lag 36 (three years) to reveal seasonality.
acf(house_sales_merged_data$house_sales,36)
We can implement yearly seasonality and examine the residual behavior before implementing independent variables.
# Deterministic trend term (1..N) and month-of-year dummies for seasonality.
house_sales_merged_data[, trnd := 1:.N]
# FIX: use TRUE rather than T -- T is an ordinary variable and can be reassigned.
house_sales_merged_data[, mon := as.character(month(Date, label = TRUE))]
head(house_sales_merged_data)
## Key: <Date>
## Date house_sales bist_idx interest_rate c_price_idx unit_price
## <Date> <num> <num> <num> <num> <num>
## 1: 2014-01-01 87639 618.10 7.3140 233.54 1510.3
## 2: 2014-02-01 82597 626.04 9.1850 234.54 1524.5
## 3: 2014-03-01 87617 694.98 9.7125 237.18 1541.3
## 4: 2014-04-01 83610 734.35 9.6100 240.37 1575.2
## 5: 2014-05-01 90377 794.46 9.2820 241.32 1597.8
## 6: 2014-06-01 92936 790.40 8.9700 242.07 1628.4
## tapu_search_count trnd mon
## <num> <int> <char>
## 1: 14 1 Jan
## 2: 15 2 Feb
## 3: 14 3 Mar
## 4: 16 4 Apr
## 5: 19 5 May
## 6: 20 6 Jun
# Work on a copy so modelling columns do not pollute the source table.
# FIX: use <- for assignment instead of = (standard R style).
tmp_h <- copy(house_sales_merged_data)
tmp_h[, actual := house_sales]
# Baseline model: linear trend plus monthly seasonal dummies.
lm_base_h <- lm(house_sales ~ trnd + mon, house_sales_merged_data)
summary(lm_base_h)
##
## Call:
## lm(formula = house_sales ~ trnd + mon, data = house_sales_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57448 -14915 -2443 14194 112588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90909.29 9157.71 9.927 < 2e-16 ***
## trnd 122.66 68.99 1.778 0.0783 .
## monAug 24097.67 11652.52 2.068 0.0410 *
## monDec 58084.53 11662.32 4.981 2.45e-06 ***
## monFeb -4499.68 11650.07 -0.386 0.7001
## monJan -8263.22 11651.09 -0.709 0.4797
## monJul 16169.32 11651.09 1.388 0.1681
## monJun 16432.68 11650.07 1.411 0.1613
## monMar 14627.46 11649.46 1.256 0.2120
## monMay 589.94 11649.46 0.051 0.9597
## monNov 20697.19 11659.26 1.775 0.0787 .
## monOct 20804.65 11656.60 1.785 0.0771 .
## monSep 24529.21 11654.36 2.105 0.0377 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26050 on 107 degrees of freedom
## Multiple R-squared: 0.3445, Adjusted R-squared: 0.271
## F-statistic: 4.686 on 12 and 107 DF, p-value: 4.29e-06
# Residual diagnostics: ACF/time plots plus Breusch-Godfrey test for serial
# correlation.
checkresiduals(lm_base_h)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals
## LM test = 32.374, df = 16, p-value = 0.008935
# In-sample fit and residuals of the trend + month model.
tmp_h[,predicted_trend_mon:=predict(lm_base_h,tmp_h)]
tmp_h[,residual_trend_mon:=actual-predicted_trend_mon]
# Actual vs fitted over time.
ggplot(tmp_h ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_mon,color='predicted'))
# Residual autocorrelation up to lag 60 (five years).
acf(tmp_h$residual_trend_mon,60)
Although we removed yearly seasonality from the data, the R-squared value shows that the model performs poorly. Let's examine the relation between the residuals and the independent variables.
# Scatter each candidate regressor against the residuals of the baseline model
# to see which independent variables still carry explanatory power.
residual_plots_h <- lapply(
  c("bist_idx", "interest_rate", "c_price_idx", "unit_price", "tapu_search_count"),
  function(v) {
    ggplot(tmp_h, aes(x = .data[[v]], y = residual_trend_mon)) +
      geom_point()
  }
)
# Arrange the five panels on a 2-row grid.
gridExtra::grid.arrange(grobs = residual_plots_h, nrow = 2)
To avoid multicollinearity, let's implement the interest rate (since there seems to be a negative correlation) into the model.
# Add the interest rate as a regressor on top of trend + month.
# FIX: use <- for assignment instead of = (standard R style).
lm_base_h2 <- lm(house_sales ~ trnd + mon + interest_rate, house_sales_merged_data)
summary(lm_base_h2)
##
## Call:
## lm(formula = house_sales ~ trnd + mon + interest_rate, data = house_sales_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71233 -11766 421 11260 92357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104225.9 8889.8 11.724 < 2e-16 ***
## trnd 381.7 84.8 4.501 1.74e-05 ***
## monAug 24696.7 10693.3 2.310 0.0229 *
## monDec 62353.0 10741.8 5.805 6.79e-08 ***
## monFeb -5093.1 10691.0 -0.476 0.6348
## monJan -8700.3 10691.6 -0.814 0.4176
## monJul 17856.0 10697.5 1.669 0.0980 .
## monJun 18637.0 10701.0 1.742 0.0845 .
## monMar 14066.6 10690.4 1.316 0.1911
## monMay 1101.9 10690.3 0.103 0.9181
## monNov 23674.8 10718.3 2.209 0.0293 *
## monOct 23719.4 10715.1 2.214 0.0290 *
## monSep 26801.1 10705.6 2.503 0.0138 *
## interest_rate -2193.0 477.7 -4.591 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23900 on 106 degrees of freedom
## Multiple R-squared: 0.4532, Adjusted R-squared: 0.3862
## F-statistic: 6.759 on 13 and 106 DF, p-value: 2.721e-09
# Diagnostics for the trend + month + interest_rate model.
checkresiduals(lm_base_h2)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals
## LM test = 28.947, df = 17, p-value = 0.03502
# Fitted values and residuals of the extended model.
tmp_h[,predicted_trend_mon_int:=predict(lm_base_h2,tmp_h)]
tmp_h[,residual_trend_mon_int:=actual-predicted_trend_mon_int]
# Actual vs fitted over time.
ggplot(tmp_h ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_mon_int,color='predicted'))
# Re-examine each regressor against the new residuals.
p1_h2 <- ggplot(tmp_h, aes(x=bist_idx, y=residual_trend_mon_int)) +
geom_point()
p2_h2 <- ggplot(tmp_h, aes(x=interest_rate, y=residual_trend_mon_int)) +
geom_point()
p3_h2 <- ggplot(tmp_h, aes(x=c_price_idx, y=residual_trend_mon_int)) +
geom_point()
p4_h2 <- ggplot(tmp_h, aes(x=unit_price, y=residual_trend_mon_int)) +
geom_point()
p5_h2 <- ggplot(tmp_h, aes(x=tapu_search_count, y=residual_trend_mon_int)) +
geom_point()
# Arrange the five panels on a 2-row grid.
gridExtra::grid.arrange(p1_h2, p2_h2, p3_h2, p4_h2, p5_h2, nrow=2)
Let’s implement the search count of “Tapu” word to somehow integrate fluctuations into the model.
# Add the "tapu" search count to capture short-term fluctuations.
# FIX: use <- for assignment instead of = (standard R style).
lm_base_h3 <- lm(house_sales ~ trnd + mon + interest_rate + tapu_search_count, house_sales_merged_data)
summary(lm_base_h3)
##
## Call:
## lm(formula = house_sales ~ trnd + mon + interest_rate + tapu_search_count,
## data = house_sales_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55560 -11183 -850 11757 47123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84505.5 7787.2 10.852 < 2e-16 ***
## trnd -471.9 136.9 -3.447 0.000817 ***
## monAug 12092.3 8946.2 1.352 0.179387
## monDec 50940.6 8954.8 5.689 1.17e-07 ***
## monFeb -8026.0 8782.7 -0.914 0.362889
## monJan -10150.1 8776.1 -1.157 0.250076
## monJul 3504.2 8999.7 0.389 0.697795
## monJun 5564.8 8965.3 0.621 0.536134
## monMar 7550.0 8818.8 0.856 0.393877
## monMay -1897.1 8782.5 -0.216 0.829402
## monNov 14275.6 8891.0 1.606 0.111361
## monOct 15499.5 8866.1 1.748 0.083354 .
## monSep 16099.0 8908.8 1.807 0.073613 .
## interest_rate -958.1 427.5 -2.241 0.027123 *
## tapu_search_count 1424.3 196.8 7.239 7.77e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19610 on 105 degrees of freedom
## Multiple R-squared: 0.6353, Adjusted R-squared: 0.5866
## F-statistic: 13.06 on 14 and 105 DF, p-value: < 2.2e-16
# Diagnostics for the final house-sales model.
checkresiduals(lm_base_h3)
##
## Breusch-Godfrey test for serial correlation of order up to 18
##
## data: Residuals
## LM test = 51.414, df = 18, p-value = 4.601e-05
# Fitted values and residuals of the final model.
tmp_h[,predicted_trend_mon_int_count:=predict(lm_base_h3,tmp_h)]
tmp_h[,residual_trend_mon_int_count:=actual-predicted_trend_mon_int_count]
# Actual vs fitted over time.
ggplot(tmp_h ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_mon_int_count,color='predicted'))
# Residual autocorrelation up to lag 60.
acf(tmp_h$residual_trend_mon_int_count,60)
We started the time series decomposition by implementing trend and yearly seasonality variables after we examined the autocorrelation of the target variable. Although we saw that the residuals reached white noise, the metrics and the residual-vs-independent-variable scatter plots confirmed that there could be improvement. To avoid multicollinearity in the model we added only the interest rate to the linear model. After that, to capture fluctuations, the search count of the word "Tapu" was also added to the model. As can be seen in both the R-squared metric and the actual-vs-predicted plot, the model showed great improvement.
Large number of investors consider term deposit (vadeli mevduat in Turkish) as a safe investment. Although it attracts many people, overall term deposit amount in the banks fluctuates especially on the countries with fluctuating interest rates and inflation. In this time series analysis, term deposit amount in Turkish Liras will be analyzed with the independent variables below:
Exchange Rates: While one option is investing in term deposits, investing in currency is another option that directly affects the amount of deposits in the banks. Increasing exchange rates could lead people to invest in currency instead of term deposits.
Interest Rate: High interest rates are the main reason people want to invest their money in bank deposits. High interest rates (especially higher than inflation or increase in exchange rate) would lead to higher amount of money in bank term deposits.
Consumer Price Index: Inflation rate and CPI are the main reasons people want to invest their money and protect their buying power. High inflation discourages people to invest in term deposits since the real value of the money in the deposit may decrease as time goes.
House Price Index: Although this does not necessarily affect the amount of money in bank term deposits, the house price index can be seen as an alternative investment option depending on the level of interest rates.
# Read the EVDS term-deposit export and convert it to a data.table.
term_deposit_all_data <- as.data.table(read_excel("~/Downloads/EVDS-12.xlsx"))
# Assign short, snake_case column names (by reference).
setnames(
  term_deposit_all_data,
  c("Date", "ex_rate", "interest_rate", "deposit_amount", "consumer_price_idx", "house_price_idx")
)
# Excel dates arrive as POSIXct; coerce to plain Date for monthly data.
term_deposit_all_data[, Date := as.Date(Date)]
head(term_deposit_all_data)
## Date ex_rate interest_rate deposit_amount consumer_price_idx
## <Date> <num> <num> <num> <num>
## 1: 2014-01-01 2.216836 7.3140 71157129 233.54
## 2: 2014-02-01 2.212760 9.1850 69734191 234.54
## 3: 2014-03-01 2.217795 9.7125 64678809 237.18
## 4: 2014-04-01 2.127471 9.6100 66501577 240.37
## 5: 2014-05-01 2.090805 9.2820 70765088 241.32
## 6: 2014-06-01 2.115724 8.9700 73326895 242.07
## house_price_idx
## <num>
## 1: 64.1
## 2: 64.8
## 3: 65.7
## 4: 66.8
## 5: 67.5
## 6: 68.2
# Inspect column classes and dimensions of the imported table.
str(term_deposit_all_data)
## Classes 'data.table' and 'data.frame': 120 obs. of 6 variables:
## $ Date : Date, format: "2014-01-01" "2014-02-01" ...
## $ ex_rate : num 2.22 2.21 2.22 2.13 2.09 ...
## $ interest_rate : num 7.31 9.19 9.71 9.61 9.28 ...
## $ deposit_amount : num 71157129 69734191 64678809 66501577 70765088 ...
## $ consumer_price_idx: num 234 235 237 240 241 ...
## $ house_price_idx : num 64.1 64.8 65.7 66.8 67.5 68.2 69.3 70.2 70.9 71.5 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Google Trends monthly search counts for the Turkish phrase "vadeli mevduat".
term_deposit_search_data <- as.data.table(read_excel("~/Downloads/multiTimeline-14.xlsx"))
setnames(term_deposit_search_data, c("Date", "vadeli_mevduat_count"))
term_deposit_search_data[, Date := as.Date(Date)]
# Full outer join on Date keeps every month present in either table.
term_deposit_merged_data <- merge(term_deposit_all_data, term_deposit_search_data, by = "Date", all = TRUE)
# Locate local maxima/minima of the deposit series: a change in the sign of
# the first difference marks a turning point (-2 = peak, +2 = trough).
# The curvature vector was previously computed twice; it is hoisted into a
# single intermediate so both index sets derive from the same quantity.
slope_sign_change_t <- diff(sign(diff(term_deposit_merged_data$deposit_amount)))
peak_indices_t <- which(slope_sign_change_t == -2) + 1
bottom_indices_t <- which(slope_sign_change_t == +2) + 1
# Month-over-month percentage change at each turning point, used to rank how
# pronounced it is (pmax guards indexing before the first observation).
peak_percentage_changes_t <- 100 * abs(((term_deposit_merged_data$deposit_amount[peak_indices_t] / term_deposit_merged_data$deposit_amount[pmax(peak_indices_t - 1, 1)]) - 1))
bottom_percentage_changes_t <- 100 * abs(((term_deposit_merged_data$deposit_amount[bottom_indices_t] / term_deposit_merged_data$deposit_amount[pmax(bottom_indices_t - 1, 1)]) - 1))
# Keep only turning points whose month-over-month move exceeds 6%.
significant_peak_indices_t <- peak_indices_t[peak_percentage_changes_t > 6] # Adjust the threshold as needed
significant_bottom_indices_t <- bottom_indices_t[bottom_percentage_changes_t > 6] # Adjust the threshold as needed
# Combine indices and pull the corresponding rows for plot labelling.
combined_indices_t <- union(significant_peak_indices_t, significant_bottom_indices_t)
extreme_data_t <- term_deposit_merged_data[combined_indices_t, ]
# Deposit amount over time on a log10 y-axis, with the significant
# peak/trough months highlighted and labelled (as "Mon YYYY") in purple.
ggplot(term_deposit_merged_data, aes(x = Date, y = deposit_amount)) +
geom_line() +
scale_y_log10() + # Apply logarithmic scale
geom_point(data = extreme_data_t, aes(x = Date, y = deposit_amount), color = "purple", size = 2) +
geom_text(data = extreme_data_t, aes(x = Date, y = deposit_amount, label=format(Date, "%b %Y")), vjust = -1, color = "purple", size = 2) +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
# Rotate x labels so the 6-month breaks do not overlap.
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Term deposits show a clear uptrend over time. Although many factors could be driving this trend, the main drivers are most likely exchange rates and inflation.
# Overlay the deposit amount and the Google search count, each z-scored via
# scale() so both series fit on one axis. NOTE(review): the y-axis is in
# standard-deviation units, not raw counts, despite the axis label.
ggplot(term_deposit_merged_data ,aes(x=Date,y=deposit_amount)) +
geom_line(aes(y = scale(deposit_amount), color = "Term Deposit Amount")) +
geom_line(aes(y = scale(vadeli_mevduat_count), color = "Search Count of Vadeli Mevduat")) +
labs(title = "Term Deposit Amount vs. Search Count of Vadeli Mevduat", x = "Date", y = "Count/Deposit Amount") +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = c(0.2, 0.8))
“Vadeli mevduat” is the Turkish term for a term deposit. Although it is not obvious at the beginning of the plot due to the scaling, there is a correlation between the search count and the amount of term deposits. After 2018 both of them show an upward trend. The search count showed some peaks during certain intervals, which could be due to sudden changes in government economic policy.
# Pairwise scatter plots and correlations across all columns of the merged data.
ggpairs(term_deposit_merged_data)
In this plot, the deposit amount shows strong correlation with many independent variables. This would be ideal if the correlations among these independent variables were not also high (multicollinearity). This should be taken care of during modeling.
The target variable shows an obvious upward trend throughout the years. Let’s see the autocorrelation plot of the target variable.
# Autocorrelation of the raw deposit series up to lag 36 (three years),
# to check for a seasonal pattern on top of the trend.
acf(term_deposit_merged_data$deposit_amount,36)
Seasonality does not show up in the autocorrelation.
# Add a deterministic linear trend index over the monthly rows.
# seq_len(.N) replaces 1:.N, which would yield c(1, 0) on an empty table.
term_deposit_merged_data[, trnd := seq_len(.N)]
head(term_deposit_merged_data)
## Key: <Date>
## Date ex_rate interest_rate deposit_amount consumer_price_idx
## <Date> <num> <num> <num> <num>
## 1: 2014-01-01 2.216836 7.3140 71157129 233.54
## 2: 2014-02-01 2.212760 9.1850 69734191 234.54
## 3: 2014-03-01 2.217795 9.7125 64678809 237.18
## 4: 2014-04-01 2.127471 9.6100 66501577 240.37
## 5: 2014-05-01 2.090805 9.2820 70765088 241.32
## 6: 2014-06-01 2.115724 8.9700 73326895 242.07
## house_price_idx vadeli_mevduat_count trnd
## <num> <num> <int>
## 1: 64.1 14 1
## 2: 64.8 14 2
## 3: 65.7 8 3
## 4: 66.8 7 4
## 5: 67.5 7 5
## 6: 68.2 10 6
# Work on a deep copy so diagnostic columns do not pollute the source table.
tmp_t <- copy(term_deposit_merged_data)
tmp_t[, actual := deposit_amount]
# Baseline model: deposit amount explained by the linear trend alone.
lm_base_t <- lm(deposit_amount ~ trnd, data = term_deposit_merged_data)
summary(lm_base_t)
##
## Call:
## lm(formula = deposit_amount ~ trnd, data = term_deposit_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -205698500 -139043568 -55135080 110781709 534016332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -128276253 31237898 -4.106 7.44e-05 ***
## trnd 6959952 448081 15.533 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.7e+08 on 118 degrees of freedom
## Multiple R-squared: 0.6716, Adjusted R-squared: 0.6688
## F-statistic: 241.3 on 1 and 118 DF, p-value: < 2.2e-16
# Residual diagnostics (time plot, ACF, histogram) plus the Breusch-Godfrey
# test for serial correlation in the baseline trend model.
checkresiduals(lm_base_t)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 114.86, df = 10, p-value < 2.2e-16
# Store the fitted trend and its residuals for plotting against predictors.
tmp_t[, predicted_trend := predict(lm_base_t, newdata = tmp_t)]
tmp_t[, residual_trend := actual - predicted_trend]
Residuals show high autocorrelation. Although this could be handled with differencing, let’s first examine the independent variables.
# Scatter the trend-model residuals against each candidate predictor to see
# what functional form (linear vs. curved) the leftover signal takes.
p1_t <- ggplot(tmp_t, aes(x=ex_rate, y=residual_trend)) +
geom_point()
p2_t <- ggplot(tmp_t, aes(x=interest_rate, y=residual_trend)) +
geom_point()
p3_t <- ggplot(tmp_t, aes(x=consumer_price_idx, y=residual_trend)) +
geom_point()
p4_t <- ggplot(tmp_t, aes(x=house_price_idx, y=residual_trend)) +
geom_point()
p5_t <- ggplot(tmp_t, aes(x=vadeli_mevduat_count, y=residual_trend)) +
geom_point()
# Arrange the five residual-vs-predictor panels in a single two-row figure.
gridExtra::grid.arrange(p1_t, p2_t, p3_t, p4_t, p5_t, nrow=2)
To increase the accuracy of the model, squared terms (of the exchange rate and the consumer price index) are added to the model.
# The residual plots suggest curvature in ex_rate and consumer_price_idx,
# so squared terms are added to both the modelling table and the copy.
term_deposit_merged_data[, `:=`(ex_rate_sq = ex_rate^2,
                                consumer_price_idx_sq = consumer_price_idx^2)]
tmp_t[, `:=`(ex_rate_sq = ex_rate^2,
             consumer_price_idx_sq = consumer_price_idx^2)]
# Full model: regress on every remaining column, squared terms included.
lm_base_t1 <- lm(deposit_amount ~ ., data = term_deposit_merged_data)
summary(lm_base_t1)
##
## Call:
## lm(formula = deposit_amount ~ ., data = term_deposit_merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -123764228 -22379406 41345 19816644 146213075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.746e+09 8.430e+10 0.021 0.98351
## Date -1.064e+05 5.255e+06 -0.020 0.98389
## ex_rate 2.684e+07 1.753e+07 1.531 0.12855
## interest_rate -6.125e+06 1.346e+06 -4.552 1.38e-05 ***
## consumer_price_idx -4.275e+05 3.935e+05 -1.086 0.27969
## house_price_idx 1.241e+06 1.456e+05 8.524 9.12e-14 ***
## vadeli_mevduat_count 7.452e+05 5.451e+05 1.367 0.17437
## trnd 4.840e+06 1.600e+08 0.030 0.97592
## ex_rate_sq -1.273e+06 4.622e+05 -2.753 0.00691 **
## consumer_price_idx_sq 2.020e+02 1.382e+02 1.461 0.14686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38580000 on 110 degrees of freedom
## Multiple R-squared: 0.9842, Adjusted R-squared: 0.9829
## F-statistic: 763 on 9 and 110 DF, p-value: < 2.2e-16
# Serial-correlation diagnostics for the full model with squared terms.
checkresiduals(lm_base_t1)
##
## Breusch-Godfrey test for serial correlation of order up to 13
##
## data: Residuals
## LM test = 68.831, df = 13, p-value = 1.317e-09
# Fitted values and residuals of the squared-terms model, kept for plotting.
tmp_t[,predicted_trend_sq:=predict(lm_base_t1,tmp_t)]
tmp_t[,residual_trend_sq:=actual-predicted_trend_sq]
# Residuals of the squared-terms model against each predictor, to verify
# that the curvature seen earlier has been absorbed by the new terms.
p1_t2 <- ggplot(tmp_t, aes(x=ex_rate, y=residual_trend_sq)) +
geom_point()
p2_t2 <- ggplot(tmp_t, aes(x=interest_rate, y=residual_trend_sq)) +
geom_point()
p3_t2 <- ggplot(tmp_t, aes(x=consumer_price_idx, y=residual_trend_sq)) +
geom_point()
p4_t2 <- ggplot(tmp_t, aes(x=house_price_idx, y=residual_trend_sq)) +
geom_point()
p5_t2 <- ggplot(tmp_t, aes(x=vadeli_mevduat_count, y=residual_trend_sq)) +
geom_point()
# Combine the five panels into one two-row figure.
gridExtra::grid.arrange(p1_t2, p2_t2, p3_t2, p4_t2, p5_t2, nrow=2)
# Actual vs. fitted deposit amount over time for the squared-terms model.
ggplot(tmp_t ,aes(x=Date)) +
geom_line(aes(y=actual,color='real')) +
geom_line(aes(y=predicted_trend_sq,color='predicted'))
The term deposit amount in Turkey has shown an upward trend over the years. There was no evidence indicating any seasonality, so a trend variable was added to the model. There was high serial correlation in the residuals, and the residuals showed relationships with the independent variables that resembled squared terms. Squared terms of the independent variables were therefore implemented, although linear terms showed better performance in terms of serial correlation of the residuals.
# Collect the three target series (assumed row-aligned by month — verify
# against the upstream tables) into one table for a correlation check.
all_target_data <- data.table(
  c_account = c_account_all_data$c_account,
  house_sales = house_sales_all_data$house_sales,
  deposit_amount = term_deposit_all_data$deposit_amount
)
print(all_target_data)
## c_account house_sales deposit_amount
## <num> <num> <num>
## 1: -4519 87639 71157129
## 2: -2703 82597 69734191
## 3: -3264 87617 64678809
## 4: -4201 83610 66501577
## 5: -2709 90377 70765088
## ---
## 116: -376 122091 991320641
## 117: 1996 102656 1001188177
## 118: 122 93761 1125552602
## 119: -2796 93514 1044564495
## 120: -2126 138577 1240934320
# CHATGPT PROMPT: I need to create a data table taking one column from each data table and merge them.
# Pearson correlation matrix between the three target series, drawn as an
# upper-triangle heat map with the coefficients printed in black.
cor_matrix <- cor(all_target_data)
corrplot(cor_matrix, method = "color", type = "upper", order = "hclust",
addCoef.col = "black",
tl.col = "black", tl.srt = 45)